This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5 ✔ purrr 0.3.4
✔ tibble 3.1.0 ✔ dplyr 1.0.9
✔ tidyr 1.2.0 ✔ stringr 1.4.0
✔ readr 2.1.2 ✔ forcats 0.5.1
package ‘tidyr’ was built under R version 4.0.5package ‘readr’ was built under R version 4.0.5── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(survival)
library(survminer)
Loading required package: ggpubr
library(cmprsk)
package ‘cmprsk’ was built under R version 4.0.5
library(tidyverse)
library(caret)
Loading required package: lattice
Attaching package: ‘caret’
The following object is masked from ‘package:survival’:
cluster
The following object is masked from ‘package:purrr’:
lift
library(survival)
library(survminer)
library(lubridate)
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
ML_edited_features = read_tsv("/Users/noa/Workspace/raxml_deep_learning_results/ready_raw_data/All_data/ML_edited_features.tsv")
New names:Rows: 439338 Columns: 92── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (9): msa_path, msa_path_x, starting_tree_object, starting_tree_type, type, final_tree_topology, msa_path_y, msa_name, feature_msa_type
dbl (82): ...1, Unnamed: 0, Unnamed: 0.1, Unnamed: 0.1.1, Unnamed: 0.1.1.1, Unnamed: 0.1.1.1.1, Unnamed: 0.1.1.1.1.1, Unnamed: 0.1.1.1.1...
lgl (1): starting_tree_bool
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_final_data = read_tsv("/Users/noa/Workspace/raxml_deep_learning_results/ready_raw_data/All_data/raw_final_performance.tsv")
New names:Rows: 820 Columns: 13── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): msa_path, starting_tree_type
dbl (11): ...1, starting_tree_ind, spr_radius, spr_cutoff, predicted_failure_probabilities, delta_ll_from_overall_msa_best_topology, tre...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
model_performance = read_tsv("/Users/noa/Workspace/raxml_deep_learning_results/ready_raw_data/All_data/final_performance_comp.tsv")
New names:Rows: 103 Columns: 18── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): msa_path
dbl (17): ...1, total_time_predicted, total_actual_time, status, diff, pct_global_max, mean_diff, n_distinct_topologies, n_trees_used, U...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
model_performance<- model_performance %>% mutate (time_imprv = curr_sample_total_time/total_actual_time, accuracy_diff =diff-curr_sample_err )
Summary statistics
ML_edited_features %>% distinct (msa_path)
ML_edited_features %>% distinct (spr_radius)
ML_edited_features %>% distinct (spr_cutoff)
per_msa_features<- ML_edited_features %>% distinct(msa_path, feature_n_loci, feature_n_seq )
per_msa_features %>% ggplot(aes(x=feature_n_seq)) + geom_histogram(color="darkblue", fill="purple")+
labs(title="Number of sequences",x="Number of sequences", y = "Count")+theme(axis.text=element_text(size=15))
per_msa_features %>% ggplot(aes(x=feature_n_loci)) + geom_histogram(color="darkblue", fill="orange")+
labs(title="Number of MSA positions",x="Number of MSA positions", y = "Count")+theme(axis.text=element_text(size=15))
Efficiency
model_performance %>% ggplot(aes(x=time_imprv)) + geom_histogram(color="darkblue", fill="lightblue")+
labs(title="Running-time improvement between default cofnfiguration to ML-based tree search",x="Running-time improvement", y = "Count")+theme(axis.text=element_text(size=14),axis.title=element_text(size=14,face="bold"))
summary(model_performance %>% pull(time_imprv))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.8004 1.6032 5.0377 5.6808 8.4668 26.3827
Accuracy
model_performance %>% ggplot(aes(x=accuracy_diff)) + geom_histogram(color="darkblue", fill="pink")+
labs(title="Log-likelihood difference between default cofnfiguration to ML-based tree search",x="Log-likelihood diff", y = "Count")+theme(axis.text=element_text(size=14),axis.title=element_text(size=14,face="bold"))
summary(model_performance %>% pull(diff))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000000 0.000000 0.000000 0.199719 0.000248 4.829481
summary(model_performance %>% pull(curr_sample_err))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000000 0.000000 0.000000 0.087392 0.003121 3.530084
summary(ML_edited_features %>% pull(delta_ll_from_overall_msa_best_topology))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00017 1.47640 1.48819 144.51634
Overall performance
model_performance<- model_performance %>% mutate (is_better_time = time_imprv>1, is_more_accurate = accuracy_diff<=0)
model_performance %>% group_by(is_better_time,is_more_accurate) %>% count()
model_performance<- model_performance %>% mutate (is_better_time = time_imprv>1, is_more_accurate = accuracy_diff<=0)
model_performance %>% group_by(is_better_time,is_more_accurate) %>% summarise(median_accuracy_diff = median(accuracy_diff) , median_running_time_imprv = median(time_imprv))
`summarise()` has grouped output by 'is_better_time'. You can override using the `.groups` argument.
ML-chosen parameters
raw_final_data %>% group_by(msa_path) %>% count() %>% ggplot(aes(x=n)) + geom_histogram(color = "purple", fill = "lightblue")+
labs(title="",x="Number of starting trees", y = "Count")+theme(axis.text=element_text(size=14),axis.title=element_text(size=14,face="bold"))
raw_final_data %>% group_by(msa_path, starting_tree_type) %>% count() %>% ungroup() %>% group_by(msa_path) %>% mutate(total_size = sum(n)) %>% filter (starting_tree_type=="pars") %>% mutate (pct_parsimony = n/total_size) %>% ggplot(aes(x=pct_parsimony)) + geom_histogram(color = "purple", fill = "lightblue")+
labs(title="",x="Fraction of parsimnoy trees", y = "Count")+theme(axis.text=element_text(size=14),axis.title=element_text(size=14,face="bold"))
raw_final_data %>% ggplot(aes(x=spr_radius)) + geom_histogram(color="darkgreen", fill="green")+
labs(title="",x="SPR radius", y = "Count")+theme(axis.text=element_text(size=14),axis.title=element_text(size=14,face="bold"))
raw_final_data %>% ggplot(aes(x=spr_cutoff)) + geom_histogram(color="orange", fill="yellow")+
labs(title="",x="SPR cutoff", y = "Count")+theme(axis.text=element_text(size=14),axis.title=element_text(size=14,face="bold"))
Correlation between features
Pypythia
ML_edited_features %>% ggplot(aes(x = feature_pypythia_msa_difficulty)) + geom_histogram(fill = "purple")
count_per_starting_tree<-ML_edited_features %>% group_by(msa_path,starting_tree_ind, starting_tree_type,tree_clusters_ind,feature_pypythia_msa_difficulty) %>% count() %>% ungroup() %>% group_by(msa_path,feature_pypythia_msa_difficulty, starting_tree_ind,starting_tree_type) %>% count()
summary(count_per_starting_tree %>% pull(n))
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 1.000 2.000 4.029 5.000 31.000
count_per_starting_tree_per_MSA<- count_per_starting_tree %>% group_by(msa_path, starting_tree_type,feature_pypythia_msa_difficulty) %>% summarise(median_per_tree = median(n))
`summarise()` has grouped output by 'msa_path', 'starting_tree_type'. You can override using the `.groups` argument.
var_per_starting_tree_per_MSA<- count_per_starting_tree %>% group_by(msa_path, starting_tree_type,feature_pypythia_msa_difficulty) %>% summarise(var_across_trees = var(n))
`summarise()` has grouped output by 'msa_path', 'starting_tree_type'. You can override using the `.groups` argument.
count_per_starting_tree_per_MSA %>% ggplot(aes(x = median_per_tree, fill = starting_tree_type))+ geom_histogram(position = position_dodge(), bins = 10)
var_per_starting_tree_per_MSA %>% ggplot(aes(x = var_across_trees, fill = starting_tree_type))+ geom_histogram(position = position_dodge(), bins = 10)
count_per_starting_tree_per_MSA %>% ggplot(aes(x = median_per_tree, y = feature_pypythia_msa_difficulty, color = starting_tree_type))+ geom_point()
var_per_starting_tree_per_MSA %>% ggplot(aes(x = var_across_trees, y = feature_pypythia_msa_difficulty, color = starting_tree_type))+ geom_point()
lm1<- lm(median_per_tree~(feature_pypythia_msa_difficulty)*starting_tree_type , data =count_per_starting_tree_per_MSA )
summary(lm1)
Call:
lm(formula = median_per_tree ~ (feature_pypythia_msa_difficulty) *
starting_tree_type, data = count_per_starting_tree_per_MSA)
Residuals:
Min 1Q Median 3Q Max
-7.6335 -1.5321 -0.4105 0.7579 17.9380
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.03144 0.36520 -0.086 0.931
feature_pypythia_msa_difficulty 6.53084 0.97227 6.717 3.89e-11 ***
starting_tree_typerand -0.07326 0.51647 -0.142 0.887
feature_pypythia_msa_difficulty:starting_tree_typerand 10.95047 1.37500 7.964 6.93e-15 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.168 on 686 degrees of freedom
Multiple R-squared: 0.461, Adjusted R-squared: 0.4586
F-statistic: 195.6 on 3 and 686 DF, p-value: < 2.2e-16
plot(fitted(lm1),resid(lm1))
lm2<- lm(var_across_trees~feature_pypythia_msa_difficulty*starting_tree_type , data =var_per_starting_tree_per_MSA )
summary(lm2)
Call:
lm(formula = var_across_trees ~ feature_pypythia_msa_difficulty *
starting_tree_type, data = var_per_starting_tree_per_MSA)
Residuals:
Min 1Q Median 3Q Max
-4.2079 -1.2399 -0.2905 0.6443 19.5277
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.2362 0.2494 -4.957 9.03e-07 ***
feature_pypythia_msa_difficulty 7.0589 0.6639 10.632 < 2e-16 ***
starting_tree_typerand 0.4716 0.3527 1.337 0.18157
feature_pypythia_msa_difficulty:starting_tree_typerand 3.0914 0.9389 3.292 0.00104 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.163 on 686 degrees of freedom
Multiple R-squared: 0.385, Adjusted R-squared: 0.3824
F-statistic: 143.2 on 3 and 686 DF, p-value: < 2.2e-16
count_per_final_tree_topology<- ML_edited_features %>% group_by(msa_path,starting_tree_ind, starting_tree_type,tree_clusters_ind) %>% count() %>% ungroup() %>% group_by(msa_path,feature_pypythia_msa_difficulty, starting_tree_ind,starting_tree_type) %>% count()
count_per_SPR_radius<-ML_edited_features %>% group_by(msa_path,spr_radius, starting_tree_type,tree_clusters_ind) %>% count() %>% ungroup() %>% group_by(msa_path,spr_radius, starting_tree_type) %>% count()
count_per_SPR_radius %>% ggplot(aes(x = n, fill = starting_tree_type))+ geom_histogram()+facet_grid(rows = vars(spr_radius))
count_per_SPR_cutoff<-ML_edited_features %>% group_by(msa_path,spr_cutoff, starting_tree_type,tree_clusters_ind) %>% count() %>% ungroup() %>% group_by(msa_path,spr_cutoff, starting_tree_type) %>% count()
count_per_SPR_cutoff %>% ggplot(aes(x = n, fill = starting_tree_type))+ geom_histogram()+facet_grid(rows = vars(spr_cutoff))
summary(count_per_SPR_radius %>% pull(n))
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 1.000 1.000 1.572 2.000 6.000
example<- ML_edited_features %>% filter (msa_path=='/groups/pupko/noaeker/data/ABC_DR/PANDIT/PF00005/ref_msa.aa.phy')
ML_edited_features %>% distinct (msa_path, starting_tree_ind,starting_tree_ll,tree_clusters_ind,feature_mean_branch_length,feature_mean_internal_branch_length,feature_mean_leaf_branch_length,feature_tree_MAD,feature_mean_rf_distance)
ML_edited_features %>% filter (msa_path=="/groups/pupko/noaeker/data/ABC_DR/PANDIT/PF00005/ref_msa.aa.phy")
tree_features_analysis = read_tsv("/Users/noa/Workspace/raxml_deep_learning_results/ready_raw_data/All_data/tree_comparisons.tsv")
New names:Rows: 407853 Columns: 50── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (9): msa_path, starting_tree_object, final_tree_topology, starting_tree_type, feature_msa_type, msa_path_other, starting_tree_obje...
dbl (41): ...1, starting_tree_ind, delta_ll_from_overall_msa_best_topology, final_ll, starting_tree_ll, feature_mean_branch_length, fea...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tree_features_analysis %>% head()
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'x' in selecting a method for function 'head': object 'tree_features_analysis' not found
tree_features_analysis %>% head(5)
tree_features_analysis_edited<- tree_features_analysis %>% mutate (LL_diff = delta_ll_from_overall_msa_best_topology_other-delta_ll_from_overall_msa_best_topology, starting_tree_ll_diff = starting_tree_ll- starting_tree_ll_other) %>% mutate (is_better = LL_diff>0.1)
tree_features_analysis_edited %>% ggplot(aes(x = rf_dist_starting_trees, y= rf_dist_final_trees)) + geom_point()+ facet_grid(rows = vars(starting_tree_type), cols = vars(starting_tree_type_other))
NA
NA
tree_features_analysis_edited %>% filter (starting_tree_type==starting_tree_type_other,) %>% ggplot(aes(y = (LL_diff), x=(starting_tree_ll_diff))) + geom_point()+ facet_grid(rows = vars(starting_tree_type))
tree_features_analysis_edited %>% filter (starting_tree_type==starting_tree_type_other,) %>% ggplot(aes(y = (rf_dist_final_trees), x=abs((starting_tree_ll_diff)))) + geom_point()+ facet_grid(rows = vars(starting_tree_type))
data_for_ML<-tree_features_analysis_edited %>% select (-starting_tree_ind,-msa_path_other , -starting_tree_ind_other, -starting_tree_object, -starting_tree_object_other,-delta_ll_from_overall_msa_best_topology_other, -final_ll_other, -final_tree_topology, -final_tree_topology_other,-starting_tree_type,-starting_tree_type_other ,-feature_msa_type,-LL_diff,-rf_dist_final_trees,-delta_ll_from_overall_msa_best_topo )
msas = tree_features_analysis_edited %>% distinct (msa_path) %>% pull(msa_path)
test_sampled_msas = msas[sample(1:length(msas),20)]
test<- data_for_ML %>% filter (msa_path %in% test_sampled_msas) %>% select (-msa_path)
train<- data_for_ML %>% filter (!(msa_path %in% test_sampled_msas)) %>% select (-msa_path)
bin_glm<- glm(is_better ~ . , data = train, family = "binomial")
caret::varImp(bin_glm)
nullmod <- glm(is_better ~1,data = train , family="binomial")
r2 = 1-logLik(bin_glm)/logLik(nullmod)
print(r2)
summary(bin_glm)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
summary(train)